In [1]:
import warnings
warnings.filterwarnings("ignore")

import os

import pandas as pd
import numpy as np
import plotly.express as px

import missingno as msno
In [2]:
#################################################################
##### Setting pandas display options for pretty print
#################################################################

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.colheader_justify', 'center')
pd.set_option('display.precision', 3)
pd.set_option('display.max_colwidth', 50)

Constant parameters¶

In [3]:
# COVID data URL picked from https://github.com/owid/covid-19-data/tree/master/public/data
COVID_DATA_URL = "https://covid.ourworldindata.org/data/owid-covid-data.csv"
COVID_DATA_FILE_NAME = "owid-covid-data.csv"

COVID_DATA_DICTIONARY_URL = "https://covid.ourworldindata.org/data/owid-covid-codebook.csv"
COVID_DATA_DICTIONARY_FILE_NAME = "owid-covid-codebook.csv"

RESULTS_PATH = "EDS_Covid19_Exp_results"

DATA_FOLDER = "data"

Generated Parameters¶

In [4]:
covid_data_full_file_path = os.path.join(RESULTS_PATH, 
                                         DATA_FOLDER, 
                                         COVID_DATA_FILE_NAME)

covid_data_dictionary_full_file_path = os.path.join(RESULTS_PATH, 
                                                    DATA_FOLDER, 
                                                    COVID_DATA_DICTIONARY_FILE_NAME)
In [5]:
# Creating experiment folders as required
if not os.path.isdir(RESULTS_PATH):
    os.makedirs(RESULTS_PATH)
    
if not os.path.isdir(os.path.join(RESULTS_PATH, 
                                  DATA_FOLDER)):
    os.makedirs(os.path.join(RESULTS_PATH, 
                             DATA_FOLDER))

Fetching Data¶

In [6]:
#################################################################
##### Covid data 
#################################################################

if os.path.isfile(covid_data_full_file_path):
    print("Local data file available:", covid_data_full_file_path)
    df_full_covid_data = pd.read_csv(covid_data_full_file_path, 
                                     sep=",", 
                                     header=0)
    print("Data load complete.")
else:
    print("No local data file found. Downloading from URL:", COVID_DATA_URL)
    df_full_covid_data = pd.read_csv(COVID_DATA_URL, 
                                     sep=",", 
                                     header=0)
    df_full_covid_data.to_csv(covid_data_full_file_path, 
                              sep=",", 
                              header=True, 
                              index=False)
    print("Data downloaded and saved to:", covid_data_full_file_path)

#################################################################
##### Covid data dictionary
#################################################################

if os.path.isfile(covid_data_dictionary_full_file_path):
    print("\nLocal dictionary file available:", covid_data_dictionary_full_file_path)
    df_covid_data_dict = pd.read_csv(covid_data_dictionary_full_file_path, 
                                     sep=",", 
                                     header=0)
    print("Dictionary load complete.")
else:
    print("\nNo local dictionary file found. Downloading from URL:", COVID_DATA_DICTIONARY_URL)
    df_covid_data_dict = pd.read_csv(COVID_DATA_DICTIONARY_URL, 
                                     sep=",", 
                                     header=0)
    df_covid_data_dict.to_csv(covid_data_dictionary_full_file_path, 
                              sep=",", 
                              header=True, 
                              index=False)
    print("Dictionary downloaded and saved to:", covid_data_dictionary_full_file_path)
Local data file available: EDS_Covid19_Exp_results\data\owid-covid-data.csv
Data load complete.

Local dictionary file available: EDS_Covid19_Exp_results\data\owid-covid-codebook.csv
Dictionary load complete.
In [31]:
#################################################################
##### codebook - contains details of every column specified by data provider
#################################################################

# df_covid_data_dict[['column', 'description']]

Missing data analysis¶

In [8]:
df_full_covid_data.shape
Out[8]:
(197003, 67)
In [9]:
# Required fields for Delivery 1 and 2
required_fields = ['location', 'date',
                   'total_cases', 'population', 
                   'people_vaccinated', 'people_fully_vaccinated', 'total_boosters']

Filter out continental and world data, keep only country specific data¶

In [10]:
print('Non-country locations:\n', df_full_covid_data[df_full_covid_data['continent'].isnull()]['location'].unique())
Non-country locations:
 ['Africa' 'Asia' 'Europe' 'European Union' 'High income' 'International'
 'Low income' 'Lower middle income' 'North America' 'Oceania'
 'South America' 'Upper middle income' 'World']
In [11]:
df_full_covid_data = df_full_covid_data[df_full_covid_data['continent'].notnull()].reset_index(drop=True)

Bar chart representing relative completion of the data columns:¶

In [12]:
msno.bar(df_full_covid_data)
Out[12]:
<AxesSubplot:>
In [13]:
msno.bar(df_full_covid_data[required_fields])
Out[13]:
<AxesSubplot:>

Nullity matrix, similar to the bar chart, but spatially represents the missing information in the dataframe:¶

In [14]:
msno.matrix(df_full_covid_data)
Out[14]:
<AxesSubplot:>
In [15]:
msno.matrix(df_full_covid_data[required_fields])
Out[15]:
<AxesSubplot:>

Dendogram to visualize the correlation of completion between variables:¶

In [16]:
msno.dendrogram(df_full_covid_data)
Out[16]:
<AxesSubplot:>
In [17]:
msno.dendrogram(df_full_covid_data[required_fields])
Out[17]:
<AxesSubplot:>

Analyzing missing data of vaccinations by country¶

In [18]:
df_missing_vaccination_by_country = df_full_covid_data[[required_fields[i] for i in [0, -3, -2, -1]]].drop('location', 
                                                                                                           axis=1).isna().groupby(df_full_covid_data.location).sum().reset_index()

df_missing_vaccination_by_country['location_records'] = df_full_covid_data['location'].value_counts().reset_index().sort_values(by='index')['location'].reset_index(drop=True)

df_missing_vaccination_by_country['people_vaccinated'] = df_missing_vaccination_by_country['people_vaccinated']/df_missing_vaccination_by_country['location_records']
df_missing_vaccination_by_country['people_fully_vaccinated'] = df_missing_vaccination_by_country['people_fully_vaccinated']/df_missing_vaccination_by_country['location_records']
df_missing_vaccination_by_country['total_boosters'] = df_missing_vaccination_by_country['total_boosters']/df_missing_vaccination_by_country['location_records']

df_missing_vaccination_by_country['mean_total_missing'] = (df_missing_vaccination_by_country['people_vaccinated'] + df_missing_vaccination_by_country['people_fully_vaccinated'] + df_missing_vaccination_by_country['total_boosters']) / 3
In [19]:
fig = px.bar(df_missing_vaccination_by_country, x='location', y='mean_total_missing')
fig.show()
In [20]:
df_missing_vaccination_by_country.sort_values(by='mean_total_missing', ascending=True).head(10)
Out[20]:
location people_vaccinated people_fully_vaccinated total_boosters location_records mean_total_missing
111 Latvia 0.349 0.365 0.372 852 0.362
198 Switzerland 0.366 0.366 0.377 853 0.370
218 United States 0.373 0.373 0.400 887 0.382
63 Estonia 0.382 0.382 0.392 875 0.385
117 Lithuania 0.364 0.385 0.426 849 0.392
153 Norway 0.342 0.374 0.471 853 0.396
70 France 0.385 0.386 0.419 885 0.397
54 Denmark 0.365 0.381 0.445 876 0.397
7 Argentina 0.400 0.400 0.400 908 0.400
35 Canada 0.418 0.388 0.438 886 0.415

Missing Data Analysis Insights:¶

  • Total 197003 records in data
  • Missing values in many columns. Required fields for Delivery 1 and 2 have missing values in:
    • total_cases, population, people_vaccinated, people_fully_vaccinated, total_boosters
  • Missing values are significant in vaccination data
    • Missing values often significant for many countries/region
    • Using the mean missing information from the three vaccination measures (people_vaccinated people_fully_vaccinated total_boosters), top 3 countries with least missing information selected
      • Latvia, Switzerland, United States selected in current report
  • From nullity matrix and dendogram, we can see that missing values of columns can occur in pairs (correlated)
In [21]:
country_list = list(df_missing_vaccination_by_country.sort_values(by='mean_total_missing', ascending=True)['location'].head(3))
In [22]:
df_3country_covid_data = df_full_covid_data[df_full_covid_data['location'].isin(country_list)][required_fields].reset_index(drop=True)

Missing Value Imputation¶

  • Interpolating using pandas default function, in both direction to impute all missing values.
    • Executed separately for each country
In [23]:
impute_column_list = ['total_cases', 'population', 
                      'people_vaccinated', 'people_fully_vaccinated', 'total_boosters']
In [24]:
msno.matrix(df_3country_covid_data)
Out[24]:
<AxesSubplot:>
In [25]:
df_3country_covid_data_imputed = None

for current_country in list(df_3country_covid_data['location'].unique()):
    
    temp_df_current_country_data = df_3country_covid_data[df_3country_covid_data['location'] == current_country]
    temp_df_current_country_data = temp_df_current_country_data.sort_values(by='date', ascending=True)
    
    for col in impute_column_list:
        temp_df_current_country_data[col] = temp_df_current_country_data[col].interpolate(limit_direction="both")
    
    if df_3country_covid_data_imputed is None:
        df_3country_covid_data_imputed = temp_df_current_country_data
    else:
        df_3country_covid_data_imputed = pd.concat([df_3country_covid_data_imputed, temp_df_current_country_data],
                                                   ignore_index=True
                                                  )
In [26]:
msno.matrix(df_3country_covid_data_imputed)
Out[26]:
<AxesSubplot:>

Exploratory Data Analysis¶

Delivery 1: The relative cases overtime of Covid infectors (absolute Covid cases/population size)¶

In [27]:
df_3country_covid_data_imputed['infector_percentage'] = df_3country_covid_data_imputed['total_cases']/df_3country_covid_data_imputed['population']
In [28]:
fig = px.line(df_3country_covid_data_imputed, 
              x='date', y='infector_percentage', 
              color='location',
              # title = 'The relative cases overtime of Covid infectors'
             )
fig.update_layout(
    yaxis_title="Total cases as percentage of population",
    xaxis_title="Timeline",
    title = {
        'text':'The total affected rate (percentage of the population) over time',
        'x':0.45,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    legend = {
        'title':{'text':'Location'}
    }
)
fig.show(renderer='notebook')

Delivery 2: The vaccination rate (percentage of the population) over time¶

In [29]:
df_3country_covid_data_imputed['vaccination_dose1_percentage'] = df_3country_covid_data_imputed['people_vaccinated']/df_3country_covid_data_imputed['population']
df_3country_covid_data_imputed['vaccination_dose2_percentage'] = df_3country_covid_data_imputed['people_fully_vaccinated']/df_3country_covid_data_imputed['population']
df_3country_covid_data_imputed['vaccination_dose3_percentage'] = df_3country_covid_data_imputed['total_boosters']/df_3country_covid_data_imputed['population']

df_3country_covid_data_imputed_Transformed = None

for dose in range(1,4):
    
    col_list = ['location', 'date', 'vaccination_dose{}_percentage'.format(dose)]
    temp_df = df_3country_covid_data_imputed[col_list]
    
    temp_df['location'] = temp_df['location']+', '+str(dose)
    temp_df.columns = ['location, dose', 'date', 'vaccination_percentage']
    
    if df_3country_covid_data_imputed_Transformed is None:
        df_3country_covid_data_imputed_Transformed = temp_df
    else:
        df_3country_covid_data_imputed_Transformed = pd.concat([df_3country_covid_data_imputed_Transformed, temp_df],#
                                                               ignore_index=True
                                                              )
In [30]:
fig = None
fig = px.line(df_3country_covid_data_imputed_Transformed, 
              x='date', y='vaccination_percentage', 
              color='location, dose',
             )
fig.update_layout(
    yaxis_title="Percentage of population vaccinated",
    xaxis_title="Timeline",
    title = {
        'text':'The vaccination rate (percentage of the population) over time',
        'x':0.45,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    legend = {
        'title':{'text':'Location, Dosage'}
    }
)
fig.show(renderer='notebook')
In [ ]: